import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
import scipy.stats as ss
import warnings
import joblib
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)
def get_deviation_of_mean_perc(pd_loan, list_var_continuous, target, multiplier):
    """
    Summarize, per continuous variable, the rows whose value falls outside
    the interval mean +/- multiplier * std, and the distribution of the
    target among those outlier rows.

    :param pd_loan: DataFrame containing the variables and the target column.
    :param list_var_continuous: iterable of continuous column names to scan.
    :param target: name of the target column in ``pd_loan``.
    :param multiplier: half-width of the interval in standard deviations.
    :return: DataFrame with one row per variable that has outliers. Columns
        are each observed target value (normalized proportions) plus
        'variable', 'sum_outlier_values' and 'porcentaje_sum_null_values'
        (fraction of rows that are outliers). Empty if no variable has
        outliers.
    """
    pd_final = pd.DataFrame()
    for col in list_var_continuous:
        series_mean = pd_loan[col].mean()
        series_std = pd_loan[col].std()
        std_amp = multiplier * series_std
        left = series_mean - std_amp
        right = series_mean + std_amp
        # Compute the outlier mask once instead of re-evaluating it for
        # every derived quantity.
        outlier_mask = (pd_loan[col] < left) | (pd_loan[col] > right)
        n_outliers = int(outlier_mask.sum())
        perc_excess = n_outliers / pd_loan[col].size
        if perc_excess > 0:
            # Build the one-row summary directly from the value_counts
            # Series. The previous transpose + drop('index') approach broke
            # on pandas >= 2.0, where reset_index() no longer produces an
            # 'index' column.
            dist = pd_loan.loc[outlier_mask, target].value_counts(normalize=True)
            pd_concat_percent = pd.DataFrame([dist.values], columns=list(dist.index))
            pd_concat_percent['variable'] = col
            pd_concat_percent['sum_outlier_values'] = n_outliers
            pd_concat_percent['porcentaje_sum_null_values'] = perc_excess
            pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
    if pd_final.empty:
        # Fixed message: this function reports outliers, not nulls.
        print('No existen variables con valores atipicos')
    return pd_final
def get_corr_matrix(dataset=None, metodo='pearson', size_figure=(10, 8)):
    """
    Plot a heatmap of the correlation matrix of ``dataset``.

    Use metodo='spearman' for a Spearman correlation instead of Pearson.

    :param dataset: DataFrame of numeric columns. Required; printing an
        error and returning 1 when omitted.
    :param metodo: correlation method passed to ``DataFrame.corr``.
    :param size_figure: (width, height) of the matplotlib figure. The
        default is a tuple now: a mutable list default is shared across
        calls and is a classic Python pitfall.
    :return: 0 on success, 1 when no dataset was supplied.
    """
    if dataset is None:
        print(u'\nHace falta pasar argumentos a la función')
        return 1
    sns.set(style="white")
    # Compute the correlation matrix.
    corr = dataset.corr(method=metodo)
    # Zero out the diagonal: self-correlation is always 1 and would
    # dominate the color scale. fill_diagonal replaces the manual loop.
    np.fill_diagonal(corr.values, 0)
    # Set up the matplotlib figure.
    f, ax = plt.subplots(figsize=size_figure)
    # Draw the heatmap with the correct aspect ratio.
    sns.heatmap(corr, center=0,
                square=True, linewidths=.5, cmap='viridis')  # cbar_kws={"shrink": .5}
    plt.show()
    return 0
def plot_feature(df, col_name, isContinuous, target):
    """
    Plot the distribution of one column: a histogram for continuous
    variables, a count plot for categorical ones.

    :param df: DataFrame holding the data.
    :param col_name: column to plot.
    :param isContinuous: True -> histogram of non-null values;
        False -> count plot over the sorted unique values.
    :param target: target column name, used only in the plot title.
    """
    # Removed: unused count_null computation and unused figure handle.
    _, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(12, 3), dpi=90)
    if isContinuous:
        sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
    else:
        sns.countplot(df[col_name], order=sorted(df[col_name].unique()),
                      color='#5975A4', saturation=1, ax=ax1)
    ax1.set_xlabel(col_name)
    ax1.set_ylabel('Count')
    ax1.set_title(col_name + ' - '+target+' yes')
    plt.xticks(rotation = 0)
    plt.tight_layout()
We are presented with a dataset showing a set of credit transactions classified according to whether fraud has occurred or not. The objective of this practice is to generate a model that, given a transaction, outputs the probability that some type of fraud exists. This will be done by analyzing the data, evaluating and understanding the data and generating the model that best predicts possible future fraud.
# Data load
# Data dictionary: one row per dataset column with its declared type
# (categorical / numerical), used later to classify the variables.
df_my_dictionary = pd.read_csv("../data/raw/dictionary.csv",sep=";")
df_payments_fraud= pd.read_csv("../data/raw/Copia de Original_dataset_payments_fraud.csv", sep=';')
# The raw file uses a decimal comma for 'connection_time'; normalize to float.
df_payments_fraud['connection_time'] = df_payments_fraud['connection_time'].str.replace(',', '.').astype(float)
df_payments_fraud.head()
| step | type | amount | gender | device | connection_time | nameOrig | race | oldbalanceOrg | age | newbalanceOrig | zone | user_number | nameDest | user_connections | security_alert | oldbalanceDest | newbalanceDest | isFraud | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | PAYMENT | 9839.64 | man | mac | 0.140039 | C1231006815 | black | 170136.0 | 85 | 160296.36 | capital | 138 | M1979787155 | 5 | 1 | 0.0 | 0.0 | 0 |
| 1 | 1 | PAYMENT | 1864.28 | woman | mac | 0.496890 | C1666544295 | asian | 21249.0 | 57 | 19384.72 | country | 909 | M2044282225 | 1 | 0 | 0.0 | 0.0 | 0 |
| 2 | 1 | TRANSFER | 181.00 | man | pc | 0.781150 | C1305486145 | asian | 181.0 | 66 | 0.00 | capital | 2569 | C553264065 | 10 | 0 | 0.0 | 0.0 | 1 |
| 3 | 1 | CASH_OUT | 181.00 | man | mac | 0.565068 | C840083671 | black | 181.0 | 31 | 0.00 | country | 1787 | C38997010 | 3 | 0 | 21182.0 | 0.0 | 1 |
| 4 | 1 | PAYMENT | 11668.14 | unknow | mac | 0.517114 | C2048537720 | black | 41554.0 | 90 | 29885.86 | country | 3997 | M1230701703 | 8 | 0 | 0.0 | 0.0 | 0 |
The loaded data shows the information of each of the operations by means of 19 columns. Each of the columns and its information is as follows:
We use a specific datasheet to work the EDA and not to corrupt the original one.
# Work on an explicit copy so the EDA transformations do not mutate the
# original dataframe. Plain assignment (the previous code) only creates an
# alias, which contradicts the stated intent of "not corrupting" the original.
df_payments_fraud_eda = df_payments_fraud.copy()
# Keep only the first character of the account ids: 'C' (customer) / 'M' (merchant).
df_payments_fraud_eda['nameOrig'] = df_payments_fraud_eda['nameOrig'].astype(str).str[0]
df_payments_fraud_eda['nameDest'] = df_payments_fraud_eda['nameDest'].astype(str).str[0]
# Treat missing values as their own 'None' category for the EDA.
df_payments_fraud_eda = df_payments_fraud_eda.fillna('None')
Selecting categorical and numerical variables from the dataframe. We use the dictionary created for this practice in which after analyzing each of the variables of the datasheet, we have determined which should be considered categorical and which numerical.
# Classify each dataframe column as categorical or numerical according to
# the hand-made data dictionary; categorical columns are also cast to the
# pandas 'category' dtype.
lista_variables_categoricas = []
lista_variables_numericas = []
columnas_presentes = set(df_payments_fraud_eda.columns)
for idx in range(len(df_my_dictionary)):
    variable = df_my_dictionary.variable[idx]
    # Skip dictionary entries that do not match any dataframe column.
    if variable not in columnas_presentes:
        continue
    tipo = df_my_dictionary.tipo[idx]
    if tipo == 'categorical':
        df_payments_fraud_eda[variable] = df_payments_fraud_eda[variable].astype("category")
        lista_variables_categoricas.append(variable)
    elif tipo == 'numerical':
        lista_variables_numericas.append(variable)
df_payments_fraud_eda.dtypes
step int64 type category amount float64 gender category device category connection_time float64 nameOrig category race category oldbalanceOrg float64 age int64 newbalanceOrig float64 zone category user_number int64 nameDest category user_connections int64 security_alert category oldbalanceDest float64 newbalanceDest float64 isFraud int64 dtype: object
df_payments_fraud_eda.dtypes.to_dict()
{'step': dtype('int64'),
'type': CategoricalDtype(categories=['CASH_IN', 'CASH_OUT', 'DEBIT', 'PAYMENT', 'TRANSFER'], ordered=False),
'amount': dtype('float64'),
'gender': CategoricalDtype(categories=['None', 'man', 'unknow', 'woman'], ordered=False),
'device': CategoricalDtype(categories=['None', 'iphone', 'mac', 'pc'], ordered=False),
'connection_time': dtype('float64'),
'nameOrig': CategoricalDtype(categories=['C'], ordered=False),
'race': CategoricalDtype(categories=['None', 'asian', 'black', 'latin'], ordered=False),
'oldbalanceOrg': dtype('float64'),
'age': dtype('int64'),
'newbalanceOrig': dtype('float64'),
'zone': CategoricalDtype(categories=['None', 'africa', 'capital', 'country'], ordered=False),
'user_number': dtype('int64'),
'nameDest': CategoricalDtype(categories=['C', 'M'], ordered=False),
'user_connections': dtype('int64'),
'security_alert': CategoricalDtype(categories=[0, 1], ordered=False),
'oldbalanceDest': dtype('float64'),
'newbalanceDest': dtype('float64'),
'isFraud': dtype('int64')}
We created the correlation matrix to observe the correlation between the different categorical variables.
get_corr_matrix(dataset = df_payments_fraud_eda[lista_variables_numericas], metodo='pearson', size_figure=[10,8])
0
We can observe high correlations between 'oldbalanceDest' and 'newbalanceDest' and another high correlation between 'newbalanceOrig' and 'oldbalanceOrg'. There are also correlations between 'amount' and 'newbalanceDest' and 'amount' and 'oldbalanceDest'.
The synergies observed in the correlations matrix are shown numerically below.
We apply the correlation matrix independently to several variables to obtain synergies.
# Pairwise absolute Pearson correlations. Keeping only the strictly lower
# triangle (k=-1) makes each pair appear exactly once and drops the diagonal,
# then we list the pairs with |corr| > 0.2.
correlacion_numerica = df_payments_fraud_eda[lista_variables_numericas].corr('pearson')
new_correlacion_numerica = correlacion_numerica.abs()
new_correlacion_numerica.loc[:,:] = np.tril(new_correlacion_numerica, k=-1) # below main lower triangle of an array
new_corr = new_correlacion_numerica.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
new_corr[new_corr['correlation']>0.2]
| level_0 | level_1 | correlation | |
|---|---|---|---|
| 53 | newbalanceOrig | oldbalanceOrg | 0.999050 |
| 87 | newbalanceDest | oldbalanceDest | 0.978401 |
| 81 | newbalanceDest | amount | 0.311942 |
| 71 | oldbalanceDest | amount | 0.215562 |
Once the synergies have been observed, we can examine which transactions are fraudulent:
Analysis of the total values of the variable 'isFraud'
# Percentage and absolute count of each isFraud value, merged into one table.
# NOTE(review): merging on the 'index' column relies on pandas < 2.0 naming
# of value_counts().reset_index(); on newer pandas the column is named after
# the series — confirm the pinned pandas version.
df_payments_fraud_2 = df_payments_fraud_eda['isFraud'].value_counts(normalize=True).mul(100).rename('percent').reset_index()
df_payments_fraud_conteo = df_payments_fraud_eda['isFraud'].value_counts().reset_index()
df_payments_fraud_pc = pd.merge(df_payments_fraud_2, df_payments_fraud_conteo, on=['index'], how='inner')
df_payments_fraud_pc
| index | percent | isFraud | |
|---|---|---|---|
| 0 | 0 | 99.89109 | 1047433 |
| 1 | 1 | 0.10891 | 1142 |
We have observed a very low value of fraudulent transactions.
# Donut chart of the isFraud class balance (fraud is ~0.1% of transactions).
fig_1 = px.pie(df_payments_fraud_pc,
               names='index',
               height=400,
               width=600,
               hole=0.4,
               title='Values for variable isFraud',
               values='percent',
               color_discrete_sequence=['#2596be','#e28743']
               )
# Horizontal legend centered below the chart.
fig_1.update_layout(legend=dict(orientation='h', yanchor='bottom', y=-0.2, xanchor='center', x=0.5))
fig_1.show()
lista_variables_categoricas
['type', 'gender', 'device', 'nameOrig', 'race', 'nameDest', 'security_alert', 'zone']
To analyze those operations that are fraudulent, we will use a datasheet in which only fraudulent transactions are shown.
# Subset with fraudulent transactions only. Take an explicit copy so the
# column edit below modifies the subset rather than a view of the EDA
# dataframe (the previous attribute-style assignment on a slice triggered
# pandas' SettingWithCopy behavior).
df_payments_fraud_yes = df_payments_fraud_eda.loc[df_payments_fraud_eda['isFraud'] == 1].copy()
df_payments_fraud_yes['security_alert'] = df_payments_fraud_yes['security_alert'].apply(str)
# Plot the distribution of each categorical variable among fraud cases.
# NOTE(review): isContinuous=True draws histograms even though these
# variables are categorical — confirm this is the intended visualization.
for col in lista_variables_categoricas:
    if col != 'isFraud':
        plot_feature(df_payments_fraud_yes, col_name=col, isContinuous=True, target='isFraud')
After analyzing the categorical variables in the cases in which fraudulent transactions occur, we can draw conclusions:
3.1. Null data analysis 3.2. Compliance 3.3. Categorical and Numerical variables 3.4. Outliers 3.5. Modelization 3.6. Coding 3.7. Normality Test 3.8. Train and test split 3.9. SMOTE 3.10. Pipeline Scaler
# Missing-value audit: counts per column and per row, plus the fraction of
# the dataframe each count represents.
df_train_null_columns = df_payments_fraud.isnull().sum().sort_values(ascending=False)
df_train_null_rows = df_payments_fraud.isnull().sum(axis=1).sort_values(ascending=False)
pd_null_columnas = pd.DataFrame(df_train_null_columns, columns=['nulos_columnas'])
pd_null_filas = pd.DataFrame(df_train_null_rows, columns=['nulos_filas'])
# Keep the target alongside the per-row null counts for later inspection.
pd_null_filas['target'] = df_payments_fraud['isFraud'].copy()
pd_null_columnas['porcentaje_columnas'] = pd_null_columnas['nulos_columnas']/df_payments_fraud.shape[0]
pd_null_filas['porcentaje_filas']= pd_null_filas['nulos_filas']/df_payments_fraud.shape[1]
pd_null_columnas.head()
| nulos_columnas | porcentaje_columnas | |
|---|---|---|
| race | 105163 | 0.100291 |
| gender | 105118 | 0.100248 |
| device | 104580 | 0.099735 |
| zone | 104414 | 0.099577 |
| step | 0 | 0.000000 |
When identifying null values in categorical variables, we chose to replace null values with the term 'None'.
df_payments_fraud = df_payments_fraud.fillna('None')
In order to develop a model that complies with race non-discrimination regulations, we opted to eliminate the 'race' and 'gender' column.
# Drop protected attributes so the model cannot discriminate on them.
del df_payments_fraud['race']
del df_payments_fraud['gender']
We use the dictionary created for this practice in which after analyzing each of the variables of the datasheet, we have determined which should be considered categorical and which numerical
# Re-run the dictionary-driven typing on the modeling dataframe (race and
# gender have been dropped by now): cast categoricals and collect the two
# name lists used downstream.
lista_variables_categoricas_target = []
lista_variables_numericas_target= []
columnas_modelo = set(df_payments_fraud.columns)
for idx in range(len(df_my_dictionary)):
    variable = df_my_dictionary.variable[idx]
    # Ignore dictionary rows whose variable is not (or no longer) a column.
    if variable not in columnas_modelo:
        continue
    tipo = df_my_dictionary.tipo[idx]
    if tipo == 'categorical':
        df_payments_fraud[variable] = df_payments_fraud[variable].astype("category")
        lista_variables_categoricas_target.append(variable)
    elif tipo == 'numerical':
        lista_variables_numericas_target.append(variable)
df_payments_fraud.dtypes
step int64 type category amount float64 device category connection_time float64 nameOrig category oldbalanceOrg float64 age int64 newbalanceOrig float64 zone category user_number int64 nameDest category user_connections int64 security_alert category oldbalanceDest float64 newbalanceDest float64 isFraud int64 dtype: object
We apply the formula with a multiplier of 1.5 to observe possible outliers. The reason for this multiplier is due to the fact that the numerical variables only receive a small range of values.
# Explicit copy so the outlier analysis does not alias (and silently
# mutate) df_payments_fraud.
df_payments_fraud_outliers = df_payments_fraud.copy()
df_payments_fraud_outliers[lista_variables_categoricas_target] = df_payments_fraud_outliers[lista_variables_categoricas_target].astype("category")
# Continuous variables for the scan: the integer-typed columns. (The
# previous astype(int) on these already-int columns was a no-op and has
# been removed.)
list_var_continuous = list(df_payments_fraud_outliers.select_dtypes('int').columns)
list_outliers_train = get_deviation_of_mean_perc(df_payments_fraud_outliers, list_var_continuous, target='isFraud', multiplier=1.5)
# Keep only the summary columns: target proportions (0/1) among outliers,
# outlier count and outlier fraction per variable.
list_outliers_train = list_outliers_train.loc[:,['variable', 0,1,'sum_outlier_values','porcentaje_sum_null_values']]
list_outliers_train
| variable | 0 | 1 | sum_outlier_values | porcentaje_sum_null_values | |
|---|---|---|---|---|---|
| 0 | step | 0.974320 | 0.025680 | 22586 | 0.021540 |
| 1 | age | 0.999072 | 0.000928 | 131463 | 0.125373 |
| 2 | user_number | 0.998826 | 0.001174 | 140512 | 0.134003 |
| 3 | user_connections | 0.998883 | 0.001117 | 209531 | 0.199825 |
| 4 | isFraud | 1.000000 | NaN | 1142 | 0.001089 |
We have identified in columns 'nameOrig' and 'nameDest' that the first character of the string references the type origin and destination as Customer 'C' or Merchant 'M'
# Copy of the original df — an actual .copy(): the previous plain
# assignment only aliased df_payments_fraud despite the comment.
df_target = df_payments_fraud.copy()
# First character of the account ids: 'C' (customer) / 'M' (merchant).
df_target['nameOrig'] = df_target['nameOrig'].astype(str).str[0]
df_target['nameDest'] = df_target['nameDest'].astype(str).str[0]
# One-hot encode each categorical column, replacing it with its dummies.
# Iterating in this order reproduces the original column layout: remaining
# columns first, then type/device/nameOrig/nameDest/security_alert/zone dummies.
for col in ['type', 'device', 'nameOrig', 'nameDest', 'security_alert', 'zone']:
    dummies = pd.get_dummies(df_target[col], prefix=col)
    df_target = df_target.drop(col, axis=1).join(dummies)
In order to evaluate the distribution of the data I perform a Shapiro-Wilk test. The result determines the scaler I will use on the data.
ss.shapiro(df_target)
ShapiroResult(statistic=0.12307161092758179, pvalue=0.0)
The p-value is smaller than 0.05, therefore I reject the null hypothesis of normality and decide to implement a MinMaxScaler to transform the data.
# Feature matrix / target vector, then an 80/20 stratified split so both
# partitions keep the ~0.1% fraud rate.
# NOTE(review): no random_state is set, so the split (and every downstream
# metric) is not reproducible across runs — consider fixing a seed.
X = df_target.drop('isFraud', axis=1)
y = df_target['isFraud']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y)
SMOTE is an oversampling technique to handle imbalanced class problem. SMOTE is an algorithm that performs data augmentation by creating synthetic data points based on the original data points.
# Oversample the minority (fraud) class on the training set only, so the
# test set keeps the real class distribution.
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
# MinMax scaling pipeline fitted on the oversampled training data.
# NOTE(review): the scaler is fitted here but its transform is never
# applied to X_train_smote / X_test before modeling — confirm intent.
smote_scaler_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())])
smote_scaler_transformer.fit(X_train_smote)
Pipeline(steps=[('scaler', MinMaxScaler())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('scaler', MinMaxScaler())])MinMaxScaler()
4.1. Base Model 4.2. LogisticRegression + Lasso 4.3. GLM 4.4. SVM 4.5. RandomForest 4.6. XGBBoost 4.7. LightGBM 4.8. Other models
DummyClassifier makes predictions that ignore the input features. This classifier serves as a simple baseline to compare against other more complex classifiers.
# Baseline: DummyClassifier ignores the input features and serves as a
# floor to compare the real models against.
from sklearn.dummy import DummyClassifier
dum = DummyClassifier()
dum.fit(X_train_smote, y_train_smote)
# Persist the fitted model, then reload it for evaluation (mimics the
# save/load cycle of a deployed model).
joblib.dump(dum, '../models/DummyClassifier.joblib')
['../models/DummyClassifier.joblib']
pred_dum = joblib.load('../models/DummyClassifier.joblib')
y_pred_dum = pred_dum.predict(X_test)
# Row-normalized confusion matrix: each true-class row sums to 1.
cnf_matrix_dum = metrics.confusion_matrix(y_test, y_pred_dum, normalize='true')
plt.figure(figsize=(6,6))
sns.heatmap(cnf_matrix_dum, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
target_names = ['0', '1']
# score_0 = false-negative rate (fraud predicted as legit);
# score_1 = false-positive rate (legit predicted as fraud).
dummy_score_0 = cnf_matrix_dum[1,0]
dummy_score_1 = cnf_matrix_dum[0,1]
print(classification_report(y_test, y_pred_dum, target_names=target_names))
precision recall f1-score support
0 1.00 1.00 1.00 209487
1 0.00 0.00 0.00 228
accuracy 1.00 209715
macro avg 0.50 0.50 0.50 209715
weighted avg 1.00 1.00 1.00 209715
As we can see, the model is not useful.
LogisticRegression Model using L1 as the penalty
# Logistic regression with L1 (lasso) penalty; liblinear supports L1.
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(C=1, penalty='l1', solver='liblinear')
log_reg.fit(X_train_smote, y_train_smote)
joblib.dump(log_reg, '../models/LogisticRegression.joblib')
['../models/LogisticRegression.joblib']
pred_log_reg = joblib.load('../models/LogisticRegression.joblib')
y_pred_log_reg = pred_log_reg.predict(X_test)
# Row-normalized confusion matrix for the test set.
cnf_matrix_log_reg = metrics.confusion_matrix(y_test, y_pred_log_reg, normalize='true')
plt.figure(figsize=(6,6))
sns.heatmap(cnf_matrix_log_reg, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
target_names = ['0', '1']
# False-negative rate and false-positive rate for the comparison charts.
log_reg_score_0 = cnf_matrix_log_reg[1,0]
log_reg_score_1 = cnf_matrix_log_reg[0,1]
print(classification_report(y_test, y_pred_log_reg, target_names=target_names))
precision recall f1-score support
0 1.00 1.00 1.00 209487
1 0.19 0.70 0.30 228
accuracy 1.00 209715
macro avg 0.59 0.85 0.65 209715
weighted avg 1.00 1.00 1.00 209715
We obtain fairly accurate values for non-fraud, but we obtain a high number of false negatives, which are very relevant to our objective.
For the GLM Model we use the GaussianNB Model. Naive Bayes methods are a set of supervised learning algorithms based on applying Bayes’ theorem with the “naive” assumption of conditional independence between every pair of features given the value of the class variable.
# Gaussian Naive Bayes (the notebook's "GLM" model).
from sklearn.naive_bayes import GaussianNB
glm = GaussianNB()
glm.fit(X_train_smote, y_train_smote)
joblib.dump(glm, '../models/GaussianNB.joblib')
['../models/GaussianNB.joblib']
pred_glm= joblib.load('../models/GaussianNB.joblib')
y_pred_glm = pred_glm.predict(X_test)
# Row-normalized confusion matrix for the test set.
cnf_matrix_glm = metrics.confusion_matrix(y_test, y_pred_glm, normalize='true')
plt.figure(figsize=(6,6))
sns.heatmap(cnf_matrix_glm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
target_names = ['0', '1']
# False-negative rate and false-positive rate for the comparison charts.
glm_score_0 = cnf_matrix_glm[1,0]
glm_score_1 = cnf_matrix_glm[0,1]
print(classification_report(y_test, y_pred_glm, target_names=target_names))
precision recall f1-score support
0 1.00 0.24 0.38 209487
1 0.00 0.96 0.00 228
accuracy 0.24 209715
macro avg 0.50 0.60 0.19 209715
weighted avg 1.00 0.24 0.38 209715
We obtain fairly accurate values for fraud, but we obtain a high number of false positives, which are relevant to our objective.
This estimator implements regularized linear models with stochastic gradient descent (SGD) learning: the gradient of the loss is estimated each sample at a time and the model is updated along the way with a decreasing strength schedule.
# Linear model trained with stochastic gradient descent (the notebook's
# "SVM": SGDClassifier's default hinge loss gives a linear SVM).
from sklearn.linear_model import SGDClassifier
svm = SGDClassifier()
svm.fit(X_train_smote, y_train_smote)
joblib.dump(svm, '../models/SGDClassifier.joblib')
['../models/SGDClassifier.joblib']
pred_svm = joblib.load('../models/SGDClassifier.joblib')
y_pred_svm = pred_svm.predict(X_test)
# Row-normalized confusion matrix for the test set.
cnf_matrix_svm = metrics.confusion_matrix(y_test, y_pred_svm, normalize='true')
plt.figure(figsize=(6,6))
sns.heatmap(cnf_matrix_svm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
target_names = ['0', '1']
# False-negative rate and false-positive rate for the comparison charts.
svm_score_0 = cnf_matrix_svm[1,0]
svm_score_1 = cnf_matrix_svm[0,1]
print(classification_report(y_test, y_pred_svm, target_names=target_names))
precision recall f1-score support
0 1.00 0.96 0.98 209487
1 0.02 0.80 0.05 228
accuracy 0.96 209715
macro avg 0.51 0.88 0.51 209715
weighted avg 1.00 0.96 0.98 209715
We observed a very interesting model for the objective we are looking for, which is the one that maximizes the frauds obtained and at the same time does not contain high levels of false negatives.
# Random forest, depth-limited to 5 to control overfitting on SMOTE data.
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(max_depth=5,n_jobs=1)
random_forest.fit(X_train_smote,y_train_smote)
joblib.dump(random_forest, '../models/RandomForestClassifier.joblib')
['../models/RandomForestClassifier.joblib']
pred_random_forest = joblib.load('../models/RandomForestClassifier.joblib')
y_pred_random_forest = pred_random_forest.predict(X_test)
# Row-normalized confusion matrix for the test set.
cnf_matrix_random_forest = metrics.confusion_matrix(y_test, y_pred_random_forest, normalize='true')
plt.figure(figsize=(6,6))
sns.heatmap(cnf_matrix_random_forest, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
target_names = ['0', '1']
# False-negative rate and false-positive rate for the comparison charts.
random_forest_score_0 = cnf_matrix_random_forest[1,0]
random_forest_score_1 = cnf_matrix_random_forest[0,1]
print(classification_report(y_test, y_pred_random_forest, target_names=target_names))
precision recall f1-score support
0 1.00 0.97 0.98 209487
1 0.03 0.89 0.06 228
accuracy 0.97 209715
macro avg 0.51 0.93 0.52 209715
weighted avg 1.00 0.97 0.98 209715
We observed a very interesting model for the objective we are looking for, which is the one that maximizes the frauds obtained and at the same time does not contain high levels of false negatives.
# XGBoost gradient-boosted trees with default hyperparameters.
import xgboost as xgb
xgboost = xgb.XGBClassifier()
xgboost.fit(X_train_smote,y_train_smote)
joblib.dump(xgboost, '../models/XGBClassifier.joblib')
[23:02:31] WARNING: /var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_44tbtwf8c1/croots/recipe/xgboost-split_1659548960882/work/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
['../models/XGBClassifier.joblib']
pred_xgboost = joblib.load('../models/XGBClassifier.joblib')
y_pred_xgboost = pred_xgboost.predict(X_test)
# Row-normalized confusion matrix for the test set.
cnf_matrix_xgboost = metrics.confusion_matrix(y_test, y_pred_xgboost, normalize='true')
plt.figure(figsize=(6,6))
sns.heatmap(cnf_matrix_xgboost, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
target_names = ['0', '1']
# False-negative rate and false-positive rate for the comparison charts.
xgboost_score_0 = cnf_matrix_xgboost[1,0]
xgboost_score_1= cnf_matrix_xgboost[0,1]
print(classification_report(y_test, y_pred_xgboost, target_names=target_names))
precision recall f1-score support
0 1.00 1.00 1.00 209487
1 0.94 0.79 0.86 228
accuracy 1.00 209715
macro avg 0.97 0.89 0.93 209715
weighted avg 1.00 1.00 1.00 209715
We observed a very interesting model for the objective we are looking for; this particular model produces a very low number of false positives, which is valuable for the objective of the model.
# LightGBM gradient-boosted trees with default hyperparameters.
import lightgbm as lgb
lightgbm = lgb.LGBMClassifier()
lightgbm.fit(X_train_smote,y_train_smote)
joblib.dump(lightgbm, '../models/LGBMClassifier.joblib')
['../models/LGBMClassifier.joblib']
pred_lightgbm = joblib.load('../models/LGBMClassifier.joblib')
y_pred_lightgbm = pred_lightgbm.predict(X_test)
# Row-normalized confusion matrix for the test set.
cnf_matrix_lightgbm = metrics.confusion_matrix(y_test, y_pred_lightgbm, normalize='true')
plt.figure(figsize=(6,6))
sns.heatmap(cnf_matrix_lightgbm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
target_names = ['0', '1']
# False-negative rate and false-positive rate for the comparison charts.
lightgbm_score_0 = cnf_matrix_lightgbm[1,0]
lightgbm_score_1 = cnf_matrix_lightgbm[0,1]
print(classification_report(y_test, y_pred_lightgbm, target_names=target_names))
precision recall f1-score support
0 1.00 1.00 1.00 209487
1 0.70 0.81 0.75 228
accuracy 1.00 209715
macro avg 0.85 0.90 0.88 209715
weighted avg 1.00 1.00 1.00 209715
We observed a very interesting model, this particular model contains a very low level of false positives but on the other hand has a high level of false negatives.
# CatBoost with only 5 boosting iterations — a very small budget; the
# training log below shows the learn loss per iteration.
from catboost import CatBoostClassifier
clf = CatBoostClassifier(iterations=5,learning_rate=0.1)
clf.fit(X_train_smote,y_train_smote)
joblib.dump(clf, '../models/CatBoostClassifier.joblib')
0: learn: 0.4988399 total: 163ms remaining: 651ms 1: learn: 0.3712376 total: 259ms remaining: 388ms 2: learn: 0.2850919 total: 350ms remaining: 233ms 3: learn: 0.2238065 total: 447ms remaining: 112ms 4: learn: 0.1882657 total: 538ms remaining: 0us
['../models/CatBoostClassifier.joblib']
pred_clf = joblib.load('../models/CatBoostClassifier.joblib')
y_pred_clf = pred_clf.predict(X_test)
# Row-normalized confusion matrix for the test set.
cnf_matrix_clf = metrics.confusion_matrix(y_test, y_pred_clf, normalize='true')
plt.figure(figsize=(6,6))
sns.heatmap(cnf_matrix_clf, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
target_names = ['0', '1']
# False-negative rate and false-positive rate for the comparison charts.
clf_score_0 = cnf_matrix_clf[1,0]
clf_score_1 = cnf_matrix_clf[0,1]
print(classification_report(y_test, y_pred_clf, target_names=target_names))
precision recall f1-score support
0 1.00 0.97 0.99 209487
1 0.04 0.97 0.07 228
accuracy 0.97 209715
macro avg 0.52 0.97 0.53 209715
weighted avg 1.00 0.97 0.98 209715
# scikit-learn gradient boosting with default hyperparameters.
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train_smote,y_train_smote)
joblib.dump(gbc, '../models/GradientBoostingClassifier.joblib')
['../models/GradientBoostingClassifier.joblib']
pred_gbc = joblib.load('../models/GradientBoostingClassifier.joblib')
y_pred_gbc = pred_gbc.predict(X_test)
# Row-normalized confusion matrix for the test set.
cnf_matrix_gbc = metrics.confusion_matrix(y_test, y_pred_gbc, normalize='true')
plt.figure(figsize=(6,6))
sns.heatmap(cnf_matrix_gbc, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
target_names = ['0', '1']
# False-negative rate and false-positive rate for the comparison charts.
gbc_score_0 = cnf_matrix_gbc[1,0]
gbc_score_1 = cnf_matrix_gbc[0,1]
print(classification_report(y_test, y_pred_gbc, target_names=target_names))
precision recall f1-score support
0 1.00 1.00 1.00 209487
1 0.30 0.86 0.44 228
accuracy 1.00 209715
macro avg 0.65 0.93 0.72 209715
weighted avg 1.00 1.00 1.00 209715
For the selection of the model, we have to take into account that this model can be put into production in a real situation of fraud detection. Therefore, in case of being the bank in charge of transactions, it is very important to minimize the number of false negatives, since these are the ones that have the worst economic impact on the bank.
At the same time, in order not to generate a bad experience for the customers, we cannot obtain a large number of false positives. Therefore, we will now evaluate the results of false negatives obtained by the models carried out.
# Bar chart of per-model FALSE-NEGATIVE rates (cnf_matrix[1,0]): fraud
# predicted as legitimate — the costliest error for the bank.
scores_0 = [['Dummy', dummy_score_0], ['LogisticRegression', log_reg_score_0], ['GLM', glm_score_0],['SVM', svm_score_0],['RandomForest',random_forest_score_0],['XGBBoost',xgboost_score_0],['Lightgbm',lightgbm_score_0],['CatBoost',clf_score_0],['GradientBoosting',gbc_score_0]]
df_scores_0 = pd.DataFrame(scores_0, columns=['Model','Score'])
df_scores_0 = df_scores_0.sort_values('Score',ascending=True)
plt.figure(figsize=(15,6))
width = 0.75 # the width of the bars
fig = plt.bar(df_scores_0.Model, df_scores_0.Score, width)
plt.title('Score Comparation')
# Annotate each bar with its rounded rate.
for i,j in zip(df_scores_0.Model,df_scores_0.Score):
    plt.annotate(str(round(j, 2)),xy=(i,j))
# Bar chart of per-model FALSE-POSITIVE rates (cnf_matrix[0,1]): legitimate
# transactions flagged as fraud — hurts customer experience.
scores_1 = [['Dummy', dummy_score_1], ['LogisticRegression', log_reg_score_1], ['GLM', glm_score_1],['SVM', svm_score_1],['RandomForest',random_forest_score_1],['XGBBoost',xgboost_score_1],['Lightgbm',lightgbm_score_1],['CatBoost',clf_score_1],['GradientBoosting',gbc_score_1]]
df_scores_1 = pd.DataFrame(scores_1, columns=['Model','Score'])
df_scores_1 = df_scores_1.sort_values('Score',ascending=True)
plt.figure(figsize=(15,6))
width = 0.75 # the width of the bars
fig = plt.bar(df_scores_1.Model, df_scores_1.Score, width)
plt.title('Score Comparation')
for i,j in zip(df_scores_1.Model,df_scores_1.Score):
    plt.annotate(str(round(j, 2)),xy=(i,j))
We note that among the most favorable models for our objective, the GLM, CatBoost and RandomForest models show a low level of false negatives, but the GLM has a high level of false positives. Therefore, after analyzing the metrics of the CatBoost and RandomForest models, the optimal model for our practice is RandomForest.
# SHAP explanation of two individual test-set predictions from the chosen
# random forest model.
import shap
shap.initjs()
# Two arbitrary test rows to explain.
row_to_show_1 = 40
row_to_show_2 = 70
data_for_prediction_1 = X_test.iloc[row_to_show_1]
data_for_prediction_2 = X_test.iloc[row_to_show_2]
# predict_proba needs a 2-D (1, n_features) array, not a Series.
data_for_prediction_array_1 = data_for_prediction_1.values.reshape(1, -1)
data_for_prediction_array_2 = data_for_prediction_2.values.reshape(1, -1)
random_forest.predict_proba(data_for_prediction_array_1)
array([[0.78988525, 0.21011475]])
random_forest.predict_proba(data_for_prediction_array_2)
array([[0.59072379, 0.40927621]])
# Create object that can calculate shap values
explainer = shap.TreeExplainer(random_forest)
# Calculate Shap values for the first row; index [1] selects the values
# for the positive (fraud) class.
shap_values_1 = explainer.shap_values(data_for_prediction_1)
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values_1[1], data_for_prediction_1)
We note that this transaction has a low probability of being fraudulent since the difference in account balances is not very suspicious of possible fraud.
# Calculate Shap values for the second row; again index [1] is the
# positive (fraud) class contribution.
shap_values_2 = explainer.shap_values(data_for_prediction_2)
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values_2[1], data_for_prediction_2)
# Raw feature values of the two explained rows, for reference.
data_for_prediction_1
step 44.000000 amount 49151.800000 connection_time 0.627846 oldbalanceOrg 51094.890000 age 99.000000 newbalanceOrig 1943.090000 user_number 1463.000000 user_connections 8.000000 oldbalanceDest 218398.470000 newbalanceDest 267550.270000 type_CASH_IN 0.000000 type_CASH_OUT 1.000000 type_DEBIT 0.000000 type_PAYMENT 0.000000 type_TRANSFER 0.000000 device_None 0.000000 device_iphone 0.000000 device_mac 0.000000 device_pc 1.000000 nameOrig_C 1.000000 nameDest_C 1.000000 nameDest_M 0.000000 security_alert_0 1.000000 security_alert_1 0.000000 zone_None 0.000000 zone_africa 1.000000 zone_capital 0.000000 zone_country 0.000000 Name: 970860, dtype: float64
data_for_prediction_2
step 1.400000e+01 amount 2.153725e+04 connection_time 8.958048e-01 oldbalanceOrg 1.249300e+04 age 2.100000e+01 newbalanceOrig 0.000000e+00 user_number 6.760000e+02 user_connections 7.000000e+00 oldbalanceDest 1.500395e+06 newbalanceDest 1.521932e+06 type_CASH_IN 0.000000e+00 type_CASH_OUT 1.000000e+00 type_DEBIT 0.000000e+00 type_PAYMENT 0.000000e+00 type_TRANSFER 0.000000e+00 device_None 0.000000e+00 device_iphone 0.000000e+00 device_mac 0.000000e+00 device_pc 1.000000e+00 nameOrig_C 1.000000e+00 nameDest_C 1.000000e+00 nameDest_M 0.000000e+00 security_alert_0 1.000000e+00 security_alert_1 0.000000e+00 zone_None 0.000000e+00 zone_africa 1.000000e+00 zone_capital 0.000000e+00 zone_country 0.000000e+00 Name: 225694, dtype: float64
After studying the data, modeling them and obtaining the optimal prediction model, we can conclude that for a case of fraud in a real practice, those transactions in which the destination accounts that originally had a value of 0, or those accounts of origin that after the transaction remain at 0, have a high probability of being fraudulent.
It can also be concluded that there is no evidence of fraudulent transactions between Clients and Merchants, but that fraud only occurs between Client to Client.
Finally, it is observed that the security alert issued by the authority is not being effective since most frauds occur without the existence of the alert.